This notebook contains code to execute plotting and exploratory statistics for replicating Goldman et al. (2022) on a subset of 3 language families: Uralic, Romance, and Niger-Congo. We begin by investigating predictors of accuracy drop between SIGMORPHON (2020) and Goldman et al. (2022), then investigate the relationship between raw accuracy on the Goldman et al. test data and various predictors.
library(ggplot2)
library(tidyverse)
library(car)
library(mgcv)
require(splines)
# Read in the replication results (path relative to the project root).
df <- read.csv("replication/replication_complete.csv")
# Make the family column pretty: e.g. "niger_congo" -> "Niger Congo".
# str_replace_all handles family names containing more than one underscore.
df$Family <- str_to_title(str_replace_all(df$Family, "_", " "))
# We're going to be making a lot of log-log plots, so precompute the
# log-scaled columns once. The double negation keeps the sign (direction)
# of the relationship while allowing log() on the magnitude; this assumes
# the raw drop/difference values are non-positive -- TODO confirm in data.
df$log_test_acc_drop <- -1 * log((-1 * df$test_acc_drop) + 1)
df$log_train_lemma_diff <- -1 * log(-1 * df$train_lemma_diff_raw)
Here, we define helper functions to run our correlational statistics, evaluate our fitted models, and compare models fitted to a single predictor to models fitted to multiple predictors in order to evaluate which better explains the data.
# Helper to report correlation statistics between two numeric vectors.
# Prints one line per method (Pearson, Spearman, Kendall) in the form
# "<method name>: <estimate> (p = <p-value>)". Returns nothing useful.
correlations <- function(a, b) {
  methods <- c("pearson", "spearman", "kendall")
  for (method_name in methods) {
    # Warnings are suppressed because cor.test() warns about exact
    # p-values whenever there are ties in the data.
    res <- suppressWarnings(cor.test(a, b, method = method_name))
    print(sprintf("%s: %f (p = %e)", res$method, res$estimate, res$p.value))
  }
}
# Helper to summarize a fitted lm: prints R^2 and AIC, draws the four
# standard plot.lm() diagnostic panels in a 2x2 grid, and returns the
# predictions (with standard errors) on `df`.
#
# df defaults to NULL, in which case predict.lm() falls back to the
# model's own training data. Previously df had no default and calls like
# eval_model(model) only worked via R's missing-argument propagation
# into predict(); the explicit NULL default makes that path robust.
eval_model <- function(model, df = NULL){
  rsquared <- summary(model)$r.squared
  aic <- AIC(model)  # renamed local: avoid shadowing the AIC() function
  print(sprintf("R^2: %f, AIC: %f", rsquared, aic))
  # 2x2 grid for the four diagnostic plots produced by plot.lm().
  layout(matrix(c(1, 2, 3, 4), 2, 2))
  plot(model)
  # se.fit spelled out (the original `se =` relied on partial matching).
  return(predict(model, newdata = df, se.fit = TRUE))
}
# Helper to compare a multi-predictor model against a nested
# single-predictor one: draws the diagnostic plots for the larger model,
# prints its AIC together with the ANOVA model-comparison p-value, and
# returns the VIFs (car::vif) of the larger model, which auto-print when
# the function is called at top level.
compare_models <- function(model_both, model_single) {
  # 2x2 grid for the four plot.lm() diagnostic panels.
  layout(matrix(1:4, 2, 2))
  plot(model_both)
  aic_both <- AIC(model_both)
  # anova() reports NA in Pr(>F) for the first model's row; drop it.
  p_values <- anova(model_both, model_single)$`Pr(>F)`[-1]
  print(sprintf("AIC: %f, ANOVA p: %f", aic_both, p_values))
  vif(model_both)
}
We begin by investigating relationships between test accuracy drop (between SIGMORPHON 2020 & Goldman et al 2022) and various predictors. Exploratory analysis tells us that the following predictors have strong relationships with accuracy drop:
Training size: the number of training triples in the Goldman et al. data
Training lemmas: the number of unique lemmas occurring in the Goldman et al. data
Lemma drop: the difference in the number of lemmas between the SIGMORPHON data and the Goldman et al. data.
We begin by investigating unscaled scatter plots before fitting linear models and running statistics on the log-log scaled plots, and finally fitting non-linear models to these predictors.
# Exploratory scatter plots on the raw (unscaled) axes: one point per row
# of df, coloured by language family.

# Test accuracy drop vs. Goldman et al. training size.
df %>%
ggplot(aes(Goldman_train_size, test_acc_drop)) +
geom_point(aes(colour = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
theme_bw() +
xlab("Goldman et al. training size") +
ylab("Test accuracy drop") +
ggtitle("Replication of Goldman et al.") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
)
# Test accuracy drop vs. number of unique training lemmas.
df %>%
ggplot(aes(Goldman_train_lemmas, test_acc_drop)) +
geom_point(aes(colour = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
theme_bw() +
xlab("Goldman et al. training lemmas") +
ylab("Test accuracy drop") +
ggtitle("Test accuracy drop vs. training lemmas") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
)
# Test accuracy drop vs. the raw lemma-count difference between datasets.
df %>%
ggplot(aes(train_lemma_diff_raw, test_acc_drop)) +
geom_point(aes(colour = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
theme_bw() +
xlab("Training lemma difference between SIGMORPHON & Goldman et al.") +
ylab("Test accuracy drop") +
ggtitle("Test accuracy drop vs. training lemma drop") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
)
From the above, we can see that the relationship between the various predictors and test accuracy drop isn’t linear. However, when both axes are log-scaled, the relationship is near linear. As such, we fit linear models and run correlation statistics on the log-log scaled version.
# ---- Linear models of accuracy drop on the log-log scale ----

# Drop ~ log(training size): fit, print R^2/AIC, diagnostics, predictions.
train_size_lm = lm(log_test_acc_drop ~ log(Goldman_train_size), data = df)
pred <- eval_model(train_size_lm, df)
[1] "R^2: 0.679104, AIC: 80.198992"
# Scatter with the fitted line and a 2*SE ribbon built from `pred` above.
df %>%
ggplot(aes(log(Goldman_train_size), log_test_acc_drop)) +
geom_point(aes(colour = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
theme_bw() +
xlab("Goldman et al. training size, log scale") +
ylab("Test accuracy drop, log scale") +
ggtitle("Test accuracy drop vs. training size") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
) +
geom_ribbon(aes(x = log(Goldman_train_size),
ymin = pred$fit - 2 * pred$se.fit,
ymax = pred$fit + 2 * pred$se.fit),
fill = "grey",
alpha = .4) +
geom_line(aes(x = log(Goldman_train_size), y = pred$fit), color = "black")
ggsave("../writeup/figs/lm_drop_size.png", dpi=500)
Saving 6.94 x 4.29 in image
# Correlation statistics for the same pair of variables.
correlations(log(df$Goldman_train_size), df$log_test_acc_drop)
[1] "Pearson's product-moment correlation: 0.824078 (p = 2.118272e-09)"
[1] "Spearman's rank correlation rho: 0.798319 (p = 4.503624e-07)"
[1] "Kendall's rank correlation tau: 0.593583 (p = 1.359831e-07)"
# Drop ~ log(training lemmas): the strongest single predictor so far.
train_lemma_lm = lm(log_test_acc_drop ~ log(Goldman_train_lemmas), data = df)
pred <- eval_model(train_lemma_lm, df)
[1] "R^2: 0.818971, AIC: 60.735439"
df %>%
ggplot(aes(log(Goldman_train_lemmas), log_test_acc_drop)) +
geom_point(aes(colour = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
theme_bw() +
xlab("Goldman et al. number of training lemmas, log scale") +
ylab("Test accuracy drop, log scale") +
ggtitle("Test accuracy drop vs. number of training lemmas") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
) +
geom_ribbon(aes(x = log(Goldman_train_lemmas),
ymin = pred$fit - 2 * pred$se.fit,
ymax = pred$fit + 2 * pred$se.fit),
fill = "grey",
alpha = .4) +
geom_line(aes(x = log(Goldman_train_lemmas), y = pred$fit), color = "black")
ggsave("../writeup/figs/lm_drop_lemmas.png", dpi=500)
Saving 6.94 x 4.29 in image
correlations(log(df$Goldman_train_lemmas), df$log_test_acc_drop)
[1] "Pearson's product-moment correlation: 0.904970 (p = 2.044293e-13)"
[1] "Spearman's rank correlation rho: 0.915412 (p = 3.427937e-14)"
[1] "Kendall's rank correlation tau: 0.747098 (p = 5.694553e-10)"
# Drop ~ lemma difference. log_train_lemma_diff was already log-scaled at
# the top of the notebook, so no log() call is needed here.
lemma_drop_lm = lm(log_test_acc_drop ~ log_train_lemma_diff, data = df)
pred <- eval_model(lemma_drop_lm, df)
[1] "R^2: 0.799573, AIC: 64.196379"
df %>%
ggplot(aes(log_train_lemma_diff, log_test_acc_drop)) +
geom_point(aes(colour = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
theme_bw() +
xlab("Training lemma drop, log scale") +
ylab("Test accuracy drop, log scale") +
ggtitle("Test accuracy drop vs. training lemma drop") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
) +
geom_ribbon(aes(x = log_train_lemma_diff,
ymin = pred$fit - 2 * pred$se.fit,
ymax = pred$fit + 2 * pred$se.fit),
fill = "grey",
alpha = .4) +
geom_line(aes(x = log_train_lemma_diff, y = pred$fit), color = "black")
ggsave("../writeup/figs/lm_drop_drop.png", dpi=500)
Saving 6.94 x 4.29 in image
correlations(df$log_train_lemma_diff, df$log_test_acc_drop)
[1] "Pearson's product-moment correlation: -0.894188 (p = 1.053677e-12)"
[1] "Spearman's rank correlation rho: -0.888906 (p = 2.209056e-12)"
[1] "Kendall's rank correlation tau: -0.726787 (p = 1.590860e-09)"
Intuitively, it makes sense that several of the possible predictors above would be co-linear: larger training data will generally contain more lemmas, for example. We investigate these co-linearities below.
# ---- Co-linearity among the candidate predictors ----

# Training size vs. training lemmas (log-log), with an lm trend line.
df %>%
ggplot(aes(log(Goldman_train_size), log(Goldman_train_lemmas))) +
geom_point(aes(colour = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
theme_bw() +
xlab("Goldman et al. training size, log scale") +
ylab("Goldman et al. number of training lemmas, log scale") +
ggtitle("Training size vs. training lemmas, log scale") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
) +
stat_smooth(method="lm", color="black", size=0.5, alpha = 0.5)
correlations(log(df$Goldman_train_size), log(df$Goldman_train_lemmas))
[1] "Pearson's product-moment correlation: 0.892150 (p = 1.408484e-12)"
[1] "Spearman's rank correlation rho: 0.890655 (p = 1.736118e-12)"
[1] "Kendall's rank correlation tau: 0.722076 (p = 2.084901e-09)"
# Training size vs. lemma difference.
df %>%
ggplot(aes(log(Goldman_train_size), log_train_lemma_diff)) +
geom_point(aes(colour = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
theme_bw() +
xlab("Goldman et al. training size, log scale") +
ylab("Training lemma difference, log scale") +
ggtitle("Training size vs. training lemma drop") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
) +
stat_smooth(method="lm", color="black", size=0.5, alpha = 0.5)
correlations(log(df$Goldman_train_size), df$log_train_lemma_diff)
[1] "Pearson's product-moment correlation: -0.921682 (p = 1.046724e-14)"
[1] "Spearman's rank correlation rho: -0.911828 (p = 6.486413e-14)"
[1] "Kendall's rank correlation tau: -0.762501 (p = 2.429011e-10)"
# Training lemmas vs. lemma difference: near-perfect correlation, which
# later surfaces as very large VIFs in the two-predictor models.
df %>%
ggplot(aes(log(Goldman_train_lemmas), log_train_lemma_diff)) +
geom_point(aes(colour = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
theme_bw() +
xlab("Goldman et al. training lemmas, log scale") +
ylab("Training lemma difference, log scale") +
ggtitle("Training lemmas vs. training lemma drop") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
) +
stat_smooth(method="lm", color="black", size=0.5, alpha = 0.5)
correlations(log(df$Goldman_train_lemmas), df$log_train_lemma_diff)
[1] "Pearson's product-moment correlation: -0.985909 (p = 2.005603e-26)"
[1] "Spearman's rank correlation rho: -0.978066 (p = 2.252220e-23)"
[1] "Kendall's rank correlation tau: -0.945390 (p = 4.816969e-15)"
We now have three possible predictor variables which we know are co-linear with one another, so we want to disentangle these relationships to determine which variable(s) best predict drops in model performance.
Above, we saw that of the single-predictor linear models, test accuracy drop was best predicted by training lemmas. Using this as a starting point, we add additional predictors to the linear models to determine if any additional predictor leads to significantly better model performance, defined as follows:
Does the AIC drop by at least 2 units (significantly better)?
Is the ANOVA p-value between the two models significant? (p < 0.05)
Is the VIF (a measure of co-linearity) low – below 3-4?
We compare a model conditioned on the number of lemmas in train and the number of triples in train to one just conditioned on the number of lemmas.
# Two-predictor model (training size + training lemmas) vs. lemmas only.
lemmas_plus_train_lm = lm(log_test_acc_drop ~ log(Goldman_train_size) + log(Goldman_train_lemmas), data = df)
summary(lemmas_plus_train_lm)$r.squared
[1] 0.8203387
# AIC rises and the ANOVA p is non-significant: adding size doesn't help.
compare_models(lemmas_plus_train_lm, train_lemma_lm)
[1] "AIC: 62.477507, ANOVA p: 0.630477"
log(Goldman_train_size) log(Goldman_train_lemmas)
4.900301 4.900301
We compare a model conditioned on the number of lemmas in train and the lemma drop between SIGMORPHON and Goldman et al in train to one just conditioned on the number of lemmas.
# Two-predictor model (training lemmas + lemma drop) vs. lemmas only.
lemmas_drop_lm = lm(log_test_acc_drop ~ log(Goldman_train_lemmas) + log_train_lemma_diff, data = df)
summary(lemmas_drop_lm)$r.squared
[1] 0.8191093
# The very high VIF (~36) confirms the strong co-linearity seen above.
compare_models(lemmas_drop_lm, train_lemma_lm)
[1] "AIC: 62.709373, ANOVA p: 0.878457"
log(Goldman_train_lemmas) log_train_lemma_diff
35.73512 35.73512
Though the relationship is near-linear when we log-scale both axes,
we can see from our residual plots above that there is some
non-linearity remaining. As such, we fit more general models using
natural cubic splines with df = 3.
# ---- Natural cubic spline (df = 3) models of accuracy drop ----

# Spline model: drop ~ ns(log training size, 3).
ns_train_size_lm = lm(log_test_acc_drop ~ ns(log(Goldman_train_size), df=3), data = df)
pred <- eval_model(ns_train_size_lm, df)
[1] "R^2: 0.801390, AIC: 67.886671"
# Same plot layout as the linear fits: points, 2*SE ribbon, fitted curve.
df %>%
ggplot(aes(log(Goldman_train_size), log_test_acc_drop)) +
geom_point(aes(colour = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
theme_bw() +
xlab("Goldman et al. training size, log scale") +
ylab("Test accuracy drop, log scale") +
ggtitle("Test accuracy drop vs. training size") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
) +
geom_ribbon(aes(x = log(Goldman_train_size),
ymin = pred$fit - 2 * pred$se.fit,
ymax = pred$fit + 2 * pred$se.fit),
fill = "grey",
alpha = .4) +
geom_line(aes(x = log(Goldman_train_size), y = pred$fit), color = "black")
ggsave("../writeup/figs/ns_drop_size.png", dpi=500)
Saving 6.94 x 4.29 in image
# Spline model: drop ~ ns(log training lemmas, 3) -- best AIC of the three.
ns_train_lemma_lm = lm(log_test_acc_drop ~ ns(log(Goldman_train_lemmas), df = 3), data = df)
pred <- eval_model(ns_train_lemma_lm, df)
[1] "R^2: 0.863321, AIC: 55.180630"
df %>%
ggplot(aes(log(Goldman_train_lemmas), log_test_acc_drop)) +
geom_point(aes(colour = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
theme_bw() +
xlab("Goldman et al. number of training lemmas, log scale") +
ylab("Test accuracy drop, log scale") +
ggtitle("Test accuracy drop vs. number of training lemmas") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
) +
geom_ribbon(aes(x = log(Goldman_train_lemmas),
ymin = pred$fit - 2 * pred$se.fit,
ymax = pred$fit + 2 * pred$se.fit),
fill = "grey",
alpha = .4) +
geom_line(aes(x = log(Goldman_train_lemmas), y = pred$fit), color = "black")
ggsave("../writeup/figs/ns_drop_lemmas.png", dpi=500)
Saving 6.94 x 4.29 in image
# Spline model: drop ~ ns(log lemma difference, 3).
ns_lemma_drop_lm = lm(log_test_acc_drop ~ ns(log_train_lemma_diff, df = 3), data = df)
pred <- eval_model(ns_lemma_drop_lm, df)
[1] "R^2: 0.841120, AIC: 60.298126"
df %>%
ggplot(aes(log_train_lemma_diff, log_test_acc_drop)) +
geom_point(aes(colour = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
theme_bw() +
xlab("Training lemma drop, log scale") +
ylab("Test accuracy drop, log scale") +
ggtitle("Test accuracy drop vs. training lemma drop") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
) +
geom_ribbon(aes(x = log_train_lemma_diff,
ymin = pred$fit - 2 * pred$se.fit,
ymax = pred$fit + 2 * pred$se.fit),
fill = "grey",
alpha = .4) +
geom_line(aes(x = log_train_lemma_diff, y = pred$fit), color = "black")
ggsave("../writeup/figs/ns_drop_drop.png", dpi=500)
Saving 6.94 x 4.29 in image
Having trained the preliminary single-predictor models, we can once again train the more complex multi-predictor models and compare them in terms of AIC, ANOVA p-value, and VIF as above:
# Spline two-predictor model (size + lemmas) vs. the spline lemmas-only
# model; compared on AIC, ANOVA p, and (generalized) VIF.
ns_lemmas_plus_train_lm = lm(log_test_acc_drop ~ ns(log(Goldman_train_size), df = 3) + ns(log(Goldman_train_lemmas), df = 3), data = df)
summary(ns_lemmas_plus_train_lm)$r.squared
[1] 0.8868365
compare_models(ns_lemmas_plus_train_lm, ns_train_lemma_lm)
[1] "AIC: 54.761370, ANOVA p: 0.158460"
GVIF Df GVIF^(1/(2*Df))
ns(log(Goldman_train_size), df = 3) 40.10359 3 1.850109
ns(log(Goldman_train_lemmas), df = 3) 40.10359 3 1.850109
# Spline two-predictor model (lemmas + lemma drop) vs. lemmas only; the
# enormous GVIF reflects the near-perfect correlation of the predictors.
ns_lemmas_drop_lm = lm(log_test_acc_drop ~ ns(log(Goldman_train_lemmas), df = 3) + ns(log_train_lemma_diff, df = 3), data = df)
summary(ns_lemmas_drop_lm)$r.squared
[1] 0.8697183
compare_models(ns_lemmas_drop_lm, ns_train_lemma_lm)
[1] "AIC: 59.550770, ANOVA p: 0.724908"
GVIF Df GVIF^(1/(2*Df))
ns(log(Goldman_train_lemmas), df = 3) 219354.3 3 7.765886
ns(log_train_lemma_diff, df = 3) 219354.3 3 7.765886
Having investigated the relationships with accuracy drop, we now wish to understand the predictors of raw accuracy. To begin, we visualize the relationship with raw accuracy for both SIGMORPHON and Goldman et al.
First, format the data for easier plotting:
# Reshape to long format: one row per (language, dataset) with columns
# train / lemmas / test, tagged by Type, so both datasets plot together.
Gold <- df %>%
select(train=Goldman_train_size,
lemmas = Goldman_train_lemmas,
test = Goldman_test_acc,
Family = Family
)
Gold$Type = "Goldman"
Sigm <- df %>%
select(train = SIGMORPHON_train_size,
lemmas = SIGMORPHON_train_lemmas,
test = SIGMORPHON_test_acc,
Family = Family
)
Sigm$Type = "SIGMORPHON"
new_df = rbind(Gold, Sigm)
# Raw test accuracy vs. training size, one lm trend line per dataset.
# Colour encodes dataset (Type), shape encodes language family.
new_df %>%
ggplot(aes(log(train), log(test + 1), color = Type)) +
geom_point(aes(shape = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
stat_smooth(aes(color=Type), method="lm", size=0.5, alpha = 0.5)+
theme_bw() +
xlab("Training size, log scale") +
ylab("Test accuracy, log scale") +
ggtitle("Test accuracy vs. training size") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
)
# Correlations computed separately per dataset.
print("-------GOLDMAN ------")
[1] "-------GOLDMAN ------"
correlations(log(df$Goldman_train_size), log(df$Goldman_test_acc + 1))
[1] "Pearson's product-moment correlation: 0.703670 (p = 3.443305e-06)"
[1] "Spearman's rank correlation rho: 0.817515 (p = 3.619313e-09)"
[1] "Kendall's rank correlation tau: 0.621986 (p = 2.456525e-07)"
print("-------SIGMORPHON -------")
[1] "-------SIGMORPHON -------"
correlations(log(df$SIGMORPHON_train_size), log(df$SIGMORPHON_test_acc + 1))
[1] "Pearson's product-moment correlation: 0.308926 (p = 7.544833e-02)"
[1] "Spearman's rank correlation rho: 0.039633 (p = 8.238934e-01)"
[1] "Kendall's rank correlation tau: 0.008993 (p = 9.408048e-01)"
# Raw test accuracy vs. number of training lemmas, same layout as above.
new_df %>%
ggplot(aes(log(lemmas), log(test + 1), color = Type)) +
geom_point(aes(shape = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
stat_smooth(aes(color=Type), method="lm", size=0.5, alpha = 0.5) +
theme_bw() +
xlab("Training lemmas, log scale") +
ylab("Test accuracy, log scale") +
ggtitle("Test accuracy vs. training lemmas") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
)
print("-------GOLDMAN ------")
[1] "-------GOLDMAN ------"
correlations(log(df$Goldman_train_lemmas), log(df$Goldman_test_acc + 1))
[1] "Pearson's product-moment correlation: 0.663836 (p = 1.868570e-05)"
[1] "Spearman's rank correlation rho: 0.849805 (p = 2.052872e-10)"
[1] "Kendall's rank correlation tau: 0.646953 (p = 8.530164e-08)"
print("-------SIGMORPHON -------")
[1] "-------SIGMORPHON -------"
correlations(log(df$SIGMORPHON_train_lemmas), log(df$SIGMORPHON_test_acc + 1))
[1] "Pearson's product-moment correlation: 0.155920 (p = 3.785560e-01)"
[1] "Spearman's rank correlation rho: -0.002142 (p = 9.904054e-01)"
[1] "Kendall's rank correlation tau: -0.057608 (p = 6.345697e-01)"
# Size/lemma co-linearity checked within each dataset.
print("-------GOLDMAN ------")
[1] "-------GOLDMAN ------"
correlations(log(df$Goldman_train_size), log(df$Goldman_train_lemmas))
[1] "Pearson's product-moment correlation: 0.892150 (p = 1.408484e-12)"
[1] "Spearman's rank correlation rho: 0.890655 (p = 1.736118e-12)"
[1] "Kendall's rank correlation tau: 0.722076 (p = 2.084901e-09)"
print("-------SIGMORPHON -------")
[1] "-------SIGMORPHON -------"
correlations(log(df$SIGMORPHON_train_size), log(df$SIGMORPHON_train_lemmas))
[1] "Pearson's product-moment correlation: 0.904018 (p = 2.380841e-13)"
[1] "Spearman's rank correlation rho: 0.891283 (p = 1.590692e-12)"
[1] "Kendall's rank correlation tau: 0.717217 (p = 2.521127e-09)"
As we did for accuracy drop, we now fit linear models to raw accuracy on the Goldman et al. data only (since there seems to be little effect for the SIGMORPHON data). We then compare the models fitted to just training size or training lemmas to the model fitted to both in the same way as above.
# ---- Linear models of raw Goldman et al. accuracy ----

# Raw accuracy ~ log(training size).
# NOTE: this reuses (overwrites) the train_size_lm name from the
# accuracy-drop analysis above.
train_size_lm = lm(log(Goldman_test_acc + 1) ~ log(Goldman_train_size), data = df)
pred <- eval_model(train_size_lm, df)
[1] "R^2: 0.495151, AIC: 101.312023"
df %>%
ggplot(aes(log(Goldman_train_size), log(Goldman_test_acc + 1))) +
geom_point(aes(colour = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
theme_bw() +
xlab("Goldman et al. training size, log scale") +
ylab("Goldman et al. test accuracy, log scale") +
ggtitle("Goldman et al. test accuracy vs. training size") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
) +
geom_ribbon(aes(x = log(Goldman_train_size),
ymin = pred$fit - 2 * pred$se.fit,
ymax = pred$fit + 2 * pred$se.fit),
fill = "grey",
alpha = .4) +
geom_line(aes(x = log(Goldman_train_size), y = pred$fit), color = "black")
ggsave("../writeup/figs/lm_raw_size.png", dpi=500)
Saving 6.94 x 4.29 in image
# Raw accuracy ~ log(training lemmas). Overwrites the drop-analysis
# train_lemma_lm, mirroring the reuse of train_size_lm above.
train_lemma_lm = lm(log(Goldman_test_acc + 1) ~ log(Goldman_train_lemmas), data = df)
# Pass df explicitly, as every other eval_model call does. The original
# call omitted df and only worked because R propagates the missing
# argument into predict(), which then falls back to the training data.
pred <- eval_model(train_lemma_lm, df)
[1] "R^2: 0.440678, AIC: 104.795841"
# Raw accuracy vs. training lemmas: points, 2*SE ribbon, fitted line.
df %>%
ggplot(aes(log(Goldman_train_lemmas), log(Goldman_test_acc + 1))) +
geom_point(aes(colour = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
theme_bw() +
xlab("Goldman et al. number of training lemmas, log scale") +
ylab("Goldman et al. test accuracy, log scale") +
ggtitle("Goldman et al. test accuracy vs. training lemmas") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
) +
geom_ribbon(aes(x = log(Goldman_train_lemmas),
ymin = pred$fit - 2 * pred$se.fit,
ymax = pred$fit + 2 * pred$se.fit),
fill = "grey",
alpha = .4) +
# Use pred$fit for the fitted line, consistent with every other plot in
# this notebook (the original called predict(train_lemma_lm) here, which
# yields the same fitted values but duplicated the computation).
geom_line(aes(x = log(Goldman_train_lemmas), y = pred$fit), color = "black")
ggsave("../writeup/figs/lm_raw_lemmas.png", dpi=500)
Saving 6.94 x 4.29 in image
# Two-predictor model of raw accuracy, compared against each
# single-predictor model in turn.
lemmas_plus_train_lm <- lm(log(Goldman_test_acc + 1) ~ log(Goldman_train_size) + log(Goldman_train_lemmas), data = df)
summary(lemmas_plus_train_lm)$r.squared
[1] 0.501522
# vs. the lemmas-only model.
compare_models(lemmas_plus_train_lm, train_lemma_lm)
[1] "AIC: 102.880219, ANOVA p: 0.060868"
log(Goldman_train_size) log(Goldman_train_lemmas)
4.900301 4.900301
# vs. the size-only model.
compare_models(lemmas_plus_train_lm, train_size_lm)
[1] "AIC: 102.880219, ANOVA p: 0.533663"
log(Goldman_train_size) log(Goldman_train_lemmas)
4.900301 4.900301
It’s very clear from the above plots – more so than for test accuracy
drop – that the relationships aren’t linear even on the log-log scale.
As such, we once again make use of a natural cubic spline with
df = 3 to extend these models to be non-linear and fit two
single predictor models and one two-predictor model as above.
# ---- Natural cubic spline (df = 3) models of raw accuracy ----

# Spline model: raw accuracy ~ ns(log training size, 3).
ns_train_size_lm = lm(log(Goldman_test_acc + 1) ~ ns(log(Goldman_train_size), df = 3), data = df)
pred <- eval_model(ns_train_size_lm, df)
[1] "R^2: 0.551319, AIC: 101.301778"
df %>%
ggplot(aes(log(Goldman_train_size), log(Goldman_test_acc + 1))) +
geom_point(aes(colour = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
theme_bw() +
xlab("Goldman et al. training size, log scale") +
ylab("Goldman et al. test accuracy, log scale") +
ggtitle("Goldman et al. test accuracy vs. training size") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
) +
geom_ribbon(aes(x = log(Goldman_train_size),
ymin = pred$fit - 2 * pred$se.fit,
ymax = pred$fit + 2 * pred$se.fit),
fill = "grey",
alpha = .4) +
geom_line(aes(x = log(Goldman_train_size), y = pred$fit), color = "black")
ggsave("../writeup/figs/ns_raw_size.png", dpi=500)
Saving 6.94 x 4.29 in image
# Spline model: raw accuracy ~ ns(log training lemmas, 3).
ns_train_lemma = lm(log(Goldman_test_acc + 1) ~ ns(log(Goldman_train_lemmas), df = 3), data = df)
pred <- eval_model(ns_train_lemma, df)
[1] "R^2: 0.641152, AIC: 93.705729"
df %>%
ggplot(aes(log(Goldman_train_lemmas), log(Goldman_test_acc + 1))) +
geom_point(aes(colour = Family), size = 5, alpha = 0.5) +
scale_color_manual(values=c("turquoise", "purple", "gold")) +
theme_bw() +
xlab("Goldman et al. number of training lemmas, log scale") +
ylab("Goldman et al. test accuracy, log scale") +
ggtitle("Goldman et al. test accuracy vs. training lemmas") +
theme(plot.title = element_text(hjust=0.5, size=18),
axis.title.y = element_text(size=14),
axis.title.x = element_text(size=14),
) +
geom_ribbon(aes(x = log(Goldman_train_lemmas),
ymin = pred$fit - 2 * pred$se.fit,
ymax = pred$fit + 2 * pred$se.fit),
fill = "grey",
alpha = .4) +
geom_line(aes(x = log(Goldman_train_lemmas), y = pred$fit), color = "black")
ggsave("../writeup/figs/ns_raw_lemmas.png", dpi=500)
Saving 6.94 x 4.29 in image
# Spline two-predictor model, compared against each single-predictor
# spline model in turn.
ns_lemmas_plus_train_lm <- lm(log(Goldman_test_acc + 1) ~ ns(log(Goldman_train_size), df = 3) + ns(log(Goldman_train_lemmas), df = 3), data = df)
summary(ns_lemmas_plus_train_lm)$r.squared
[1] 0.6854902
compare_models(ns_lemmas_plus_train_lm, ns_train_lemma)
[1] "AIC: 95.221721, ANOVA p: 0.304902"
GVIF Df GVIF^(1/(2*Df))
ns(log(Goldman_train_size), df = 3) 40.10359 3 1.850109
ns(log(Goldman_train_lemmas), df = 3) 40.10359 3 1.850109
compare_models(ns_lemmas_plus_train_lm, ns_train_size_lm)
[1] "AIC: 95.221721, ANOVA p: 0.020695"
GVIF Df GVIF^(1/(2*Df))
ns(log(Goldman_train_size), df = 3) 40.10359 3 1.850109
ns(log(Goldman_train_lemmas), df = 3) 40.10359 3 1.850109